5.0 10X Genomics PBMC 3K Dataset

In [1]:
from clustergrammer_widget import *
net = Network(clustergrammer_widget)
df = {}

import clustergrammer_groupby as cby
import gene_exp_10x
In [2]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

Load Data

In [3]:
df['ge-ini'] = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df['ge-ini'].shape
Out[3]:
(32738, 2700)
In [4]:
all_genes = df['ge-ini'].index.tolist()
print(len(all_genes))
keep_genes = [x for x in all_genes if 'RPL' not in x]
keep_genes = [x for x in keep_genes if 'RPS' not in x]
print(len(keep_genes))

df['ge'] = df['ge-ini'].loc[keep_genes]
df['ge'].shape

# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']

all_genes = df['ge'].index.tolist()
mito_genes = [x for x in all_genes if 'MT-' == x[:3] or 
             x.split('_')[0] in list_mito_genes]
print(mito_genes)

keep_genes = [x for x in all_genes if x not in mito_genes]
df['ge'] = df['ge'].ix[keep_genes]

# normalize by UMI count
barcode_umi_sum = df['ge'].sum()
df['ge'] = df['ge'].div(barcode_umi_sum)
32738
32546
['MTRNR2L11_3066', 'MTRNR2L12_6165', 'MTRNR2L13_7998', 'MTRF1L_11630', 'MTRNR2L6_13036', 'MTRNR2L10_13646', 'MTRNR2L7_17194', 'MTRNR2L5_17355', 'MTRNR2L8_18439', 'MTRF1_21974', 'MTRNR2L4_24777', 'MTRNR2L1_26599', 'MTRNR2L3_29240', 'MT-ND1_32696', 'MT-ND2_32697', 'MT-CO1_32698', 'MT-CO2_32699', 'MT-ATP8_32700', 'MT-ATP6_32701', 'MT-CO3_32702', 'MT-ND3_32703', 'MT-ND4L_32704', 'MT-ND4_32705', 'MT-ND5_32706', 'MT-ND6_32707', 'MT-CYB_32708']
/Users/nickfernandez/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:20: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
In [5]:
net.load_df(df['ge'])
net.normalize(axis='row', norm_type='zscore')
net.swap_nan_for_zero()
df['ge-z'] = net.export_df()
df['ge-z'].shape
Out[5]:
(32520, 2700)

Visualize Original Dataset

In [6]:
net.load_df(df['ge'])
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.

Load NM'3337 gene sigantures

In [7]:
net.load_file('../data/cell_type_signatures/nm3337_broad_cell_type_sigs.txt')
df['bct-sig'] = net.export_df()
print(df['bct-sig'].shape)

net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')
df['nct-sig'] = net.export_df()
print(df['nct-sig'].shape)
(523, 9)
(523, 22)
In [8]:
sig_rows = df['bct-sig'].index.tolist()
clean_sig_rows = [x.split('_')[0] for x in sig_rows]
print(len(clean_sig_rows), len(list(set(clean_sig_rows))))
523 523
In [9]:
ge_rows = df['ge'].index.tolist()
clean_ge_rows = [x.split('_')[0] for x in ge_rows]
print(len(ge_rows), len(list(set(clean_ge_rows))))
32520 32425
In [10]:
ser_ge_rows = pd.Series(clean_ge_rows)
In [11]:
gene_name_count = ser_ge_rows.value_counts(ascending=False)
duplicate_genes = gene_name_count[gene_name_count > 1].index.tolist()
len(duplicate_genes)
Out[11]:
91

only add unique index to duplicate genes

In [12]:
dup_index = {}
new_rows = []
for inst_row in clean_ge_rows:
    
    # add index to non-unique genes
    if inst_row in duplicate_genes:
        
        # calc non-unique index
        if inst_row not in dup_index:
            dup_index[inst_row] = 1
        else:
            dup_index[inst_row] = dup_index[inst_row]  + 1
            
        new_row = inst_row + '_' + str(dup_index[inst_row])
        
    else:
        new_row = inst_row
        
    new_rows.append(new_row)
In [13]:
print(len(new_rows))
print(len(list(set(new_rows))))
32520
32520
In [14]:
df['ge-z'].index = new_rows

Predict Cell Types using NM3337 Signatures

In [15]:
rows = df['nct-sig'].index.tolist()
new_rows = [x.split('_')[0] for x in rows]
df['nct-sig'].index = new_rows
In [16]:
df['nct-sig'].columns.tolist()
Out[16]:
[('B cells naive', 'B cells naive'),
 ('B cells memory', 'B cells memory'),
 ('Plasma cells', 'Plasma cells'),
 ('T cells CD8', 'T cells CD8'),
 ('T cells CD4 naive', 'T cells CD4 naive'),
 ('T cells CD4 memory resting', 'T cells CD4 memory resting'),
 ('T cells CD4 memory activated', 'T cells CD4 memory activated'),
 ('T cells follicular helper', 'T cells follicular helper'),
 ('T cells regulatory (Tregs)', 'T cells regulatory (Tregs)'),
 ('T cells gamma delta', 'T cells gamma delta'),
 ('NK cells resting', 'NK cells resting'),
 ('NK cells activated', 'NK cells activated'),
 ('Monocytes', 'Monocytes'),
 ('Macrophages M0', 'Macrophages M0'),
 ('Macrophages M1', 'Macrophages M1'),
 ('Macrophages M2', 'Macrophages M2'),
 ('Dendritic cells resting', 'Dendritic cells resting'),
 ('Dendritic cells activated', 'Dendritic cells activated'),
 ('Mast cells resting', 'Mast cells resting'),
 ('Mast cells activated', 'Mast cells activated'),
 ('Eosinophils', 'Eosinophils'),
 ('Neutrophils', 'Neutrophils')]
In [17]:
rows = df['bct-sig'].index.tolist()
new_rows = [x.split('_')[0] for x in rows]
df['bct-sig'].index = new_rows
In [18]:
# rows = df['ge-z'].index.tolist()
# new_rows = [x.split('_')[0] for x in rows]
# df['ge-z'].index = new_rows
In [19]:
df['pred_cat'], df['sig_sim'], y_info = cby.predict_cats_from_sigs(df['ge-z'], df['bct-sig'], 
                                                                   predict_level='Cell Type', unknown_thresh=0.05)
In [20]:
net.load_df(df['pred_cat'])
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.
In [21]:
df['ge-cat'] = deepcopy(df['ge'])
df['ge-cat'].shape
Out[21]:
(32520, 2700)
In [22]:
# transfer predicted categories to full dataset and add UMI count
cat_cols = df['pred_cat'].columns.tolist()
df['ge-cat'].columns = cat_cols

new_cols = [(x[0], x[1], 'UMI: ' + str(barcode_umi_sum[x[0]])) for x in cat_cols]

df['ge-cat-umi'] = deepcopy(df['ge-cat'])
df['ge-cat-umi'].columns = new_cols
print(df['ge-cat-umi'].shape)
(32520, 2700)
In [23]:
net.load_df(df['ge-cat-umi'])
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.normalize(axis='row', norm_type='zscore')
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.